This analysis is to look at the MDS samples that were treated with the drug HTT homoharringtonine which is a translation inhibitor. There were only 2 samples for each treatment in this arm.
library(dplyr)
library(ggplot2)
library(DESeq2)
## Warning: package 'GenomeInfoDb' was built under R version 4.1.1
## Warning: package 'MatrixGenerics' was built under R version 4.1.1
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)
library(fgsea)
library(limma)
##This section of code will generate a summary table of the samples sequenced and their sequencing and alignment metrics
| Sample | patient | reads | %Q30 | Duplication rate | % reads with adapter | STAR alignment number | percent aligned | splices annotated | salmon mapping |
|---|---|---|---|---|---|---|---|---|---|
| CD123+ | HTB336 | 28161997 | 93 | 40 | 2.7 | 24795059 | 88.04 | 8802685 | 82.8170 |
| CD123+ | HTB61 | 17643949 | 90 | 29 | 2.5 | 13932164 | 78.96 | 3351481 | 68.1966 |
| CD123+ HHT | HTB336 | 17398408 | 93 | 42 | 2.7 | 15452341 | 88.81 | 5925372 | 87.9806 |
| CD123+ HHT | HTB61 | 22970807 | 93 | 33 | 2.6 | 20607839 | 89.71 | 4897461 | 75.8607 |
##This section of code will import and format the data for DeSeq2
#to generate a vector of names and file locations
files<-file.path("~/Desktop/Jordan files/counts.rRNA/HHT/", list.files("~/Desktop/Jordan files/counts.rRNA/HHT/"), "quant.sf")
names(files)<-list.files("~/Desktop/Jordan files/counts.rRNA/HHT")
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host='www.ensembl.org')
t2g_hs <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
tx2gene <- t2g_hs[,c(1,2)]
tx2gene[nrow(tx2gene)+1,]=c('rRNA_45S_NR145819','rRNA_45S_NR145819')
# import transcript level counts
txi.salmon.t<-tximport(files, type="salmon", txOut=TRUE)
txi.salmon.g<-tximport(files=files, type="salmon", tx2gene= tx2gene, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
### this code works but if I remove the ignoreTxVersion I get an error, this may have to do with how I am generating my tx2gene file
# Extract counts only
counts <- txi.salmon.g$counts %>%
as.data.frame()
#Extract TPM
tpms <- data.frame(txi.salmon.g$abundance)
##for clients the counts and tpm files should be written out
metadata <- read_excel("~/Desktop/Jordan files/counts.rRNA/meta.xlsx")
sums<-colSums(txi.salmon.g$counts)
sums<-as.data.frame(sums)
counts.rRNA<-as.data.frame(txi.salmon.g$counts)
rna.counts<-as.data.frame(t(slice_tail(counts.rRNA)))
rna.counts <-merge(rna.counts,sums,by='row.names',all=TRUE)
rna.counts<-mutate(rna.counts, "percentrRNA"= (rRNA_45S_NR145819 /sums)*100 )
names(rna.counts)[1] <- 'SampleName'
rna.counts<-select(rna.counts, c("SampleName", "percentrRNA"))
metadata<-dplyr::full_join(metadata,rna.counts, by="SampleName")
metadata$percentrRNA <- round(metadata$percentrRNA ,digit=2)
expressed_genes<-tpms %>% filter_all(all_vars(. > 5))
##This chunk of code generates a heatmap using the spearman method as well as a correlation heatmap
exp.pca <- prcomp(t(log2(expressed_genes)))
exp.pca.summary <- summary(exp.pca)$importance
pc1var = round(exp.pca.summary[3,1] * 100, 1)
pc2var = round(exp.pca.summary[3,2] * 100 - pc1var, 1)
exp.pca.pc <- data.frame(exp.pca$x, sample = colnames(expressed_genes))
coldata<-metadata
coldata<-column_to_rownames(coldata, 'SampleName')
## need to confirm that all names are in the same order
all(rownames(coldata) %in% colnames(txi.salmon.g$counts))
## [1] TRUE
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] FALSE
coldata<-coldata[colnames(txi.salmon.g$counts),]
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] TRUE
dds <- DESeqDataSetFromTximport(txi.salmon.g,
colData = coldata,
design = ~ patient + cellType)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using just counts from tximport
dds<-estimateSizeFactors(dds)
sf<-as.data.frame(dds$sizeFactor)
sf<-rownames_to_column(sf, var="SampleName")
metadata<-inner_join(metadata, sf, by="SampleName")
names(metadata)[5] <- "sizeFactor"
metadata$sizeFactor <- round(metadata$sizeFactor ,digit=2)
### unfiltered data
dds.unfiltered <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.unfiltered <- results( dds.unfiltered)
##filtered data changed this from dds[rowMins(counts(dds))>5,] to a less stringent filtering
keep<-rowSums(counts(dds))>=10
#dds.filtered<-dds[rowMins(counts(dds))>5,]
dds.filtered<-dds[keep,]
dds.filtered<-DESeq(dds.filtered)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.filtered<-results(dds.filtered)
###outcome the first filtering cut from 60000 genese to 14000 genes with the new filter 35000genes are left which is about 1/2
### need to subset to do pairwise comparison unclear if this keeps the patient design aspect
res_123vs123HHT_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "123pos_HHT"))
res_123vs123HHT_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "123pos_HHT"))
## look at the numbers of genes meeting threshold the log fold change call is not changing things
sum( res_123vs123HHT_unfiltered$pvalue < 0.01 & abs(res_123vs123HHT_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 738
sum(res_123vs123HHT_filtered$pvalue < 0.01 & abs(res_123vs123HHT_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 726
sum(res_123vs123HHT_unfiltered$padj < 0.01, na.rm=TRUE)
## [1] 127
sum(res_123vs123HHT_filtered$padj < 0.01, na.rm=TRUE)
## [1] 132
sum(res_123vs123HHT_unfiltered$padj < 0.05, na.rm=TRUE)
## [1] 304
sum(res_123vs123HHT_filtered$padj < 0.05, na.rm=TRUE)
## [1] 295
## Warning: Removed 45307 rows containing missing values (geom_point).
## Warning: Removed 7688 rows containing missing values (geom_point).
###MA plots
pca_data<-vst(dds, blind=T)
ntop=500
rv <- rowVars(assay(pca_data))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(pca_data)[select,]))
percentVar2 <- data.frame(percentVar=pca$sdev^2/sum(pca$sdev^2))%>%
mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df<-pca$x%>%data.frame()%>%mutate(type=metadata$cellType,percent=metadata$percentrRNA)
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")
ggplot(pca_df, aes(PC1, PC2, color=type, shape=metadata$patient))+geom_point(size=3)+
xlab(paste0("PC1: ",round(percentVar2$percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar2$percentVar[2] * 100), "% variance"))+
scale_color_manual(values=alt_col_values)+labs(color="")+theme_bw()+
ggtitle("PCA of analyzed data")
##pull gene names for gsea analysis
ens2gene <- t2g_hs[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)
#loading hallmark and KEGG pathways
pathways.hallmark <- gmtPathways("~/Desktop/Jordan files/h.all.v7.4.symbols.gmt")
pathways.kegg<-gmtPathways("~/Desktop/Jordan files/c2.cp.kegg.v7.4.symbols.gmt")
##generating tidy data and adding it to the results file
res_123vs123HHT_filtered_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "123pos_HHT"),tidy=TRUE)
colnames(res_123vs123HHT_filtered_gsea)[1]<-"ensembl_gene_id"
res_123vs123HHT_filtered_gsea<-inner_join(res_123vs123HHT_filtered_gsea, ens2gene, by="ensembl_gene_id")
res_123vs123HHT_filtered_gsea_sum<-res_123vs123HHT_filtered_gsea%>%
dplyr::select(Gene, stat) %>%
na.omit() %>%
distinct() %>%
group_by(Gene) %>%
summarise(stat=mean(stat))
rank_posHHT<-deframe(res_123vs123HHT_filtered_gsea_sum)
fgsea_posHHT_hallmark<- fgsea(pathways=pathways.hallmark, stats=rank_posHHT)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (4.03% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
fgsea_posHHT_KEGG<-fgsea(pathways=pathways.kegg, stats=rank_posHHT)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (4.03% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : For some pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for better estimation.
fgseaRes_posHHT_tidy <-fgsea_posHHT_hallmark%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_posHHT_kegg_tidy <-fgsea_posHHT_KEGG%>%
as_tibble() %>%
arrange(desc(NES))
##Displaying a table of ordered pathways
fgseaRes_posHHT_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs 123posHHT.')
fgseaRes_posHHT_kegg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: KEGG pathway genes with 123pos vs 123posHHT.')
#plot the waterfall results
ggplot(fgseaRes_posHHT_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.01)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA 123pos vs 123posHHT") +
theme_minimal()
ggplot(fgseaRes_posHHT_kegg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.01)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Kegg pathways NES from GSEA 123pos vs 123posHHT") +
theme_minimal()